/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

import java.io.IOException;

/**
 * Splits words into subwords and performs optional transformations on subword
 * groups. Words are split into subwords with the following rules: - split on
 * intra-word delimiters (by default, all non alpha-numeric characters). -
 * "Wi-Fi" -> "Wi", "Fi" - split on case transitions - "PowerShot" -> "Power",
 * "Shot" - split on letter-number transitions - "SD500" -> "SD", "500" -
 * leading and trailing intra-word delimiters on each subword are ignored -
 * "//hello---there, 'dude'" -> "hello", "there", "dude" - trailing "'s" are
 * removed for each subword - "O'Neil's" -> "O", "Neil" - Note: this step isn't
 * performed in a separate filter because of possible subword combinations.
 * 
 * The <b>combinations</b> parameter affects how subwords are combined: -
 * combinations="0" causes no subword combinations. - "PowerShot" -> 0:"Power",
 * 1:"Shot" (0 and 1 are the token positions) - combinations="1" means that in
 * addition to the subwords, maximum runs of non-numeric subwords are catenated
 * and produced at the same position of the last subword in the run. -
 * "PowerShot" -> 0:"Power", 1:"Shot" 1:"PowerShot" - "A's+B's&C's" -> 0:"A",
 * 1:"B", 2:"C", 2:"ABC" - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super",
 * 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder",
 * 6:"AutoCoder"
 * 
 * One use for WordDelimiterFilter is to help match words with different subword
 * delimiters. For example, if the source text contained "wi-fi" one may want
 * "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. One way of doing so is to
 * specify combinations="1" in the analyzer used for indexing, and
 * combinations="0" (the default) in the analyzer used for querying. Given that
 * the current StandardTokenizer immediately removes many intra-word delimiters,
 * it is recommended that this filter be used after a tokenizer that does not do
 * this (such as WhitespaceTokenizer).
 * 
 * @version $Id: WordDelimiterFilter.java 1166766 2011-09-08 15:52:10Z rmuir $
 */

final class WordDelimiterFilter extends TokenFilter {

	public static final int LOWER = 0x01;
	public static final int UPPER = 0x02;
	public static final int DIGIT = 0x04;
	public static final int SUBWORD_DELIM = 0x08;

	// combinations: for testing, not for setting bits
	public static final int ALPHA = 0x03;
	public static final int ALPHANUM = 0x07;

	/**
	 * Causes parts of words to be generated:
	 * <p/>
	 * "PowerShot" => "Power" "Shot"
	 */
	public static final int GENERATE_WORD_PARTS = 1;

	/**
	 * Causes number subwords to be generated:
	 * <p/>
	 * "500-42" => "500" "42"
	 */
	public static final int GENERATE_NUMBER_PARTS = 2;

	/**
	 * Causes maximum runs of word parts to be catenated:
	 * <p/>
	 * "wi-fi" => "wifi"
	 */
	public static final int CATENATE_WORDS = 4;

	/**
	 * Causes maximum runs of word parts to be catenated:
	 * <p/>
	 * "wi-fi" => "wifi"
	 */
	public static final int CATENATE_NUMBERS = 8;

	/**
	 * Causes all subword parts to be catenated:
	 * <p/>
	 * "wi-fi-4000" => "wifi4000"
	 */
	public static final int CATENATE_ALL = 16;

	/**
	 * Causes original words are preserved and added to the subword list
	 * (Defaults to false)
	 * <p/>
	 * "500-42" => "500" "42" "500-42"
	 */
	public static final int PRESERVE_ORIGINAL = 32;

	/**
	 * If not set, causes case changes to be ignored (subwords will only be
	 * generated given SUBWORD_DELIM tokens)
	 */
	public static final int SPLIT_ON_CASE_CHANGE = 64;

	/**
	 * If not set, causes numeric changes to be ignored (subwords will only be
	 * generated given SUBWORD_DELIM tokens).
	 */
	public static final int SPLIT_ON_NUMERICS = 128;

	/**
	 * Causes trailing "'s" to be removed for each subword
	 * <p/>
	 * "O'Neil's" => "O", "Neil"
	 */
	public static final int STEM_ENGLISH_POSSESSIVE = 256;

	/**
	 * If not null is the set of tokens to protect from being delimited
	 * 
	 */
	final CharArraySet protWords;

	private final int flags;

	private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
	private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
	private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
	private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);

	// used for iterating word delimiter breaks
	private final WordDelimiterIterator iterator;

	// used for concatenating runs of similar typed subwords (word,number)
	private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
	// number of subwords last output by concat.
	private int lastConcatCount = 0;

	// used for catenate all
	private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();

	// used for accumulating position increment gaps
	private int accumPosInc = 0;

	private char savedBuffer[] = new char[1024];
	private int savedStartOffset;
	private int savedEndOffset;
	private String savedType;
	private boolean hasSavedState = false;
	// if length by start + end offsets doesn't match the term text then assume
	// this is a synonym and don't adjust the offsets.
	private boolean hasIllegalOffsets = false;

	// for a run of the same subword type within a word, have we output
	// anything?
	private boolean hasOutputToken = false;
	// when preserve original is on, have we output any token following it?
	// this token must have posInc=0!
	private boolean hasOutputFollowingOriginal = false;

	/**
	 * Creates a new WordDelimiterFilter
	 * 
	 * @param in
	 *            TokenStream to be filtered
	 * @param charTypeTable
	 *            table containing character types
	 * @param configurationFlags
	 *            Flags configuring the filter
	 * @param protWords
	 *            If not null is the set of tokens to protect from being
	 *            delimited
	 */
	public WordDelimiterFilter(TokenStream in, byte[] charTypeTable,
			int configurationFlags, CharArraySet protWords) {
		super(in);
		this.flags = configurationFlags;
		this.protWords = protWords;
		this.iterator = new WordDelimiterIterator(charTypeTable,
				has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS),
				has(STEM_ENGLISH_POSSESSIVE));
	}

	/**
	 * Creates a new WordDelimiterFilter using
	 * {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE} as its
	 * charTypeTable
	 * 
	 * @param in
	 *            TokenStream to be filtered
	 * @param configurationFlags
	 *            Flags configuring the filter
	 * @param protWords
	 *            If not null is the set of tokens to protect from being
	 *            delimited
	 */
	public WordDelimiterFilter(TokenStream in, int configurationFlags,
			CharArraySet protWords) {
		this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
				configurationFlags, protWords);
	}

	/**
	 * @param in
	 *            Token stream to be filtered.
	 * @param charTypeTable
	 *            table containing character types
	 * @param generateWordParts
	 *            If 1, causes parts of words to be generated: "PowerShot" =>
	 *            "Power" "Shot"
	 * @param generateNumberParts
	 *            If 1, causes number subwords to be generated: "500-42" =>
	 *            "500" "42"
	 * @param catenateWords
	 *            1, causes maximum runs of word parts to be catenated: "wi-fi"
	 *            => "wifi"
	 * @param catenateNumbers
	 *            If 1, causes maximum runs of number parts to be catenated:
	 *            "500-42" => "50042"
	 * @param catenateAll
	 *            If 1, causes all subword parts to be catenated: "wi-fi-4000"
	 *            => "wifi4000"
	 * @param splitOnCaseChange
	 *            1, causes "PowerShot" to be two tokens; ("Power-Shot" remains
	 *            two parts regards)
	 * @param preserveOriginal
	 *            If 1, includes original words in subwords: "500-42" => "500"
	 *            "42" "500-42"
	 * @param splitOnNumerics
	 *            1, causes "j2se" to be three tokens; "j" "2" "se"
	 * @param stemEnglishPossessive
	 *            If 1, causes trailing "'s" to be removed for each subword:
	 *            "O'Neil's" => "O", "Neil"
	 * @param protWords
	 *            If not null is the set of tokens to protect from being
	 *            delimited
	 * @deprecated Use
	 *             {@link #WordDelimiterFilter(TokenStream, byte[], int, CharArraySet)}
	 */
	@Deprecated
	public WordDelimiterFilter(TokenStream in, byte[] charTypeTable,
			int generateWordParts, int generateNumberParts, int catenateWords,
			int catenateNumbers, int catenateAll, int splitOnCaseChange,
			int preserveOriginal, int splitOnNumerics,
			int stemEnglishPossessive, CharArraySet protWords) {
		super(in);

		int flags = 0;
		if (generateWordParts != 0) {
			flags |= GENERATE_WORD_PARTS;
		}
		if (generateNumberParts != 0) {
			flags |= GENERATE_NUMBER_PARTS;
		}
		if (catenateWords != 0) {
			flags |= CATENATE_WORDS;
		}
		if (catenateNumbers != 0) {
			flags |= CATENATE_NUMBERS;
		}
		if (catenateAll != 0) {
			flags |= CATENATE_ALL;
		}
		if (preserveOriginal != 0) {
			flags |= PRESERVE_ORIGINAL;
		}
		if (splitOnCaseChange != 0) {
			flags |= SPLIT_ON_CASE_CHANGE;
		}
		if (splitOnNumerics != 0) {
			flags |= SPLIT_ON_NUMERICS;
		}
		if (stemEnglishPossessive != 0) {
			flags |= STEM_ENGLISH_POSSESSIVE;
		}
		this.protWords = protWords;
		this.iterator = new WordDelimiterIterator(charTypeTable,
				splitOnCaseChange != 0, splitOnNumerics != 0,
				stemEnglishPossessive != 0);
		this.flags = flags;
	}

	/**
	 * Compatibility constructor
	 * 
	 * @deprecated Use
	 *             {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
	 *             instead.
	 */
	@Deprecated
	public WordDelimiterFilter(TokenStream in, byte[] charTypeTable,
			int generateWordParts, int generateNumberParts, int catenateWords,
			int catenateNumbers, int catenateAll, int splitOnCaseChange,
			int preserveOriginal, int splitOnNumerics, CharArraySet protWords) {
		this(in, charTypeTable, generateWordParts, generateNumberParts,
				catenateWords, catenateNumbers, catenateAll, splitOnCaseChange,
				preserveOriginal, 1, 1, null);
	}

	/**
	 * Compatibility constructor
	 * 
	 * @deprecated Use
	 *             {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
	 *             instead.
	 */
	@Deprecated
	public WordDelimiterFilter(TokenStream in, byte[] charTypeTable,
			int generateWordParts, int generateNumberParts, int catenateWords,
			int catenateNumbers, int catenateAll, int splitOnCaseChange,
			int preserveOriginal) {
		this(in, charTypeTable, generateWordParts, generateNumberParts,
				catenateWords, catenateNumbers, catenateAll, splitOnCaseChange,
				preserveOriginal, 1, null);
	}

	/**
	 * @param in
	 *            Token stream to be filtered.
	 * @param generateWordParts
	 *            If 1, causes parts of words to be generated: "PowerShot",
	 *            "Power-Shot" => "Power" "Shot"
	 * @param generateNumberParts
	 *            If 1, causes number subwords to be generated: "500-42" =>
	 *            "500" "42"
	 * @param catenateWords
	 *            1, causes maximum runs of word parts to be catenated: "wi-fi"
	 *            => "wifi"
	 * @param catenateNumbers
	 *            If 1, causes maximum runs of number parts to be catenated:
	 *            "500-42" => "50042"
	 * @param catenateAll
	 *            If 1, causes all subword parts to be catenated: "wi-fi-4000"
	 *            => "wifi4000"
	 * @param splitOnCaseChange
	 *            1, causes "PowerShot" to be two tokens; ("Power-Shot" remains
	 *            two parts regards)
	 * @param preserveOriginal
	 *            If 1, includes original words in subwords: "500-42" => "500"
	 *            "42" "500-42"
	 * @param splitOnNumerics
	 *            1, causes "j2se" to be three tokens; "j" "2" "se"
	 * @param stemEnglishPossessive
	 *            If 1, causes trailing "'s" to be removed for each subword:
	 *            "O'Neil's" => "O", "Neil"
	 * @param protWords
	 *            If not null is the set of tokens to protect from being
	 *            delimited
	 * @deprecated Use
	 *             {@link #WordDelimiterFilter(TokenStream, int, CharArraySet)}
	 */
	@Deprecated
	public WordDelimiterFilter(TokenStream in, int generateWordParts,
			int generateNumberParts, int catenateWords, int catenateNumbers,
			int catenateAll, int splitOnCaseChange, int preserveOriginal,
			int splitOnNumerics, int stemEnglishPossessive,
			CharArraySet protWords) {
		this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
				generateWordParts, generateNumberParts, catenateWords,
				catenateNumbers, catenateAll, splitOnCaseChange,
				preserveOriginal, splitOnNumerics, stemEnglishPossessive,
				protWords);
	}

	/**
	 * @deprecated Use
	 *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
	 *             instead.
	 */
	@Deprecated
	public WordDelimiterFilter(TokenStream in, int generateWordParts,
			int generateNumberParts, int catenateWords, int catenateNumbers,
			int catenateAll, int splitOnCaseChange, int preserveOriginal,
			int splitOnNumerics, CharArraySet protWords) {
		this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
				generateWordParts, generateNumberParts, catenateWords,
				catenateNumbers, catenateAll, splitOnCaseChange,
				preserveOriginal, splitOnNumerics, 1, protWords);
	}

	/**
	 * * Compatibility constructor
	 * 
	 * @deprecated Use
	 *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
	 *             instead.
	 */
	@Deprecated
	public WordDelimiterFilter(TokenStream in, int generateWordParts,
			int generateNumberParts, int catenateWords, int catenateNumbers,
			int catenateAll, int splitOnCaseChange, int preserveOriginal) {
		this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
				generateWordParts, generateNumberParts, catenateWords,
				catenateNumbers, catenateAll, splitOnCaseChange,
				preserveOriginal);
	}

	/**
	 * Compatibility constructor
	 * 
	 * @deprecated Use
	 *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
	 *             instead.
	 */
	@Deprecated
	public WordDelimiterFilter(TokenStream in, byte[] charTypeTable,
			int generateWordParts, int generateNumberParts, int catenateWords,
			int catenateNumbers, int catenateAll) {
		this(in, charTypeTable, generateWordParts, generateNumberParts,
				catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
	}

	/**
	 * Compatibility constructor
	 * 
	 * @deprecated Use
	 *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
	 *             instead.
	 */
	@Deprecated
	public WordDelimiterFilter(TokenStream in, int generateWordParts,
			int generateNumberParts, int catenateWords, int catenateNumbers,
			int catenateAll) {
		this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
				generateWordParts, generateNumberParts, catenateWords,
				catenateNumbers, catenateAll, 1, 0, 1, null);
	}

	@Override
	public boolean incrementToken() throws IOException {
		while (true) {
			if (!hasSavedState) {
				// process a new input word
				if (!input.incrementToken()) {
					return false;
				}

				int termLength = termAttribute.length();
				char[] termBuffer = termAttribute.buffer();

				accumPosInc += posIncAttribute.getPositionIncrement();

				iterator.setText(termBuffer, termLength);
				iterator.next();

				// word of no delimiters, or protected word: just return it
				if ((iterator.current == 0 && iterator.end == termLength)
						|| (protWords != null && protWords.contains(termBuffer,
								0, termLength))) {
					posIncAttribute.setPositionIncrement(accumPosInc);
					accumPosInc = 0;
					return true;
				}

				// word of simply delimiters
				if (iterator.end == WordDelimiterIterator.DONE
						&& !has(PRESERVE_ORIGINAL)) {
					// if the posInc is 1, simply ignore it in the accumulation
					if (posIncAttribute.getPositionIncrement() == 1) {
						accumPosInc--;
					}
					continue;
				}

				saveState();

				hasOutputToken = false;
				hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL);
				lastConcatCount = 0;

				if (has(PRESERVE_ORIGINAL)) {
					posIncAttribute.setPositionIncrement(accumPosInc);
					accumPosInc = 0;
					return true;
				}
			}

			// at the end of the string, output any concatenations
			if (iterator.end == WordDelimiterIterator.DONE) {
				if (!concat.isEmpty()) {
					if (flushConcatenation(concat)) {
						return true;
					}
				}

				if (!concatAll.isEmpty()) {
					// only if we haven't output this same combo above!
					if (concatAll.subwordCount > lastConcatCount) {
						concatAll.writeAndClear();
						return true;
					}
					concatAll.clear();
				}

				// no saved concatenations, on to the next input word
				hasSavedState = false;
				continue;
			}

			// word surrounded by delimiters: always output
			if (iterator.isSingleWord()) {
				generatePart(true);
				iterator.next();
				return true;
			}

			int wordType = iterator.type();

			// do we already have queued up incompatible concatenations?
			if (!concat.isEmpty() && (concat.type & wordType) == 0) {
				if (flushConcatenation(concat)) {
					hasOutputToken = false;
					return true;
				}
				hasOutputToken = false;
			}

			// add subwords depending upon options
			if (shouldConcatenate(wordType)) {
				if (concat.isEmpty()) {
					concat.type = wordType;
				}
				concatenate(concat);
			}

			// add all subwords (catenateAll)
			if (has(CATENATE_ALL)) {
				concatenate(concatAll);
			}

			// if we should output the word or number part
			if (shouldGenerateParts(wordType)) {
				generatePart(false);
				iterator.next();
				return true;
			}

			iterator.next();
		}
	}

	/**
	 * {@inheritDoc}
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		hasSavedState = false;
		concat.clear();
		concatAll.clear();
		accumPosInc = 0;
	}

	// ================================================= Helper Methods
	// ================================================

	/**
	 * Saves the existing attribute states
	 */
	private void saveState() {
		// otherwise, we have delimiters, save state
		savedStartOffset = offsetAttribute.startOffset();
		savedEndOffset = offsetAttribute.endOffset();
		// if length by start + end offsets doesn't match the term text then
		// assume this is a synonym and don't adjust the offsets.
		hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute
				.length());
		savedType = typeAttribute.type();

		if (savedBuffer.length < termAttribute.length()) {
			savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(),
					RamUsageEstimator.NUM_BYTES_CHAR)];
		}

		System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0,
				termAttribute.length());
		iterator.text = savedBuffer;

		hasSavedState = true;
	}

	/**
	 * Flushes the given WordDelimiterConcatenation by either writing its concat
	 * and then clearing, or just clearing.
	 * 
	 * @param concatenation
	 *            WordDelimiterConcatenation that will be flushed
	 * @return {@code true} if the concatenation was written before it was
	 *         cleared, {@code} false otherwise
	 */
	private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
		lastConcatCount = concatenation.subwordCount;
		if (concatenation.subwordCount != 1
				|| !shouldGenerateParts(concatenation.type)) {
			concatenation.writeAndClear();
			return true;
		}
		concatenation.clear();
		return false;
	}

	/**
	 * Determines whether to concatenate a word or number if the current word is
	 * the given type
	 * 
	 * @param wordType
	 *            Type of the current word used to determine if it should be
	 *            concatenated
	 * @return {@code true} if concatenation should occur, {@code false}
	 *         otherwise
	 */
	private boolean shouldConcatenate(int wordType) {
		return (has(CATENATE_WORDS) && isAlpha(wordType))
				|| (has(CATENATE_NUMBERS) && isDigit(wordType));
	}

	/**
	 * Determines whether a word/number part should be generated for a word of
	 * the given type
	 * 
	 * @param wordType
	 *            Type of the word used to determine if a word/number part
	 *            should be generated
	 * @return {@code true} if a word/number part should be generated,
	 *         {@code false} otherwise
	 */
	private boolean shouldGenerateParts(int wordType) {
		return (has(GENERATE_WORD_PARTS) && isAlpha(wordType))
				|| (has(GENERATE_NUMBER_PARTS) && isDigit(wordType));
	}

	/**
	 * Concatenates the saved buffer to the given WordDelimiterConcatenation
	 * 
	 * @param concatenation
	 *            WordDelimiterConcatenation to concatenate the buffer to
	 */
	private void concatenate(WordDelimiterConcatenation concatenation) {
		if (concatenation.isEmpty()) {
			concatenation.startOffset = savedStartOffset + iterator.current;
		}
		concatenation.append(savedBuffer, iterator.current, iterator.end
				- iterator.current);
		concatenation.endOffset = savedStartOffset + iterator.end;
	}

	/**
	 * Generates a word/number part, updating the appropriate attributes
	 * 
	 * @param isSingleWord
	 *            {@code true} if the generation is occurring from a single
	 *            word, {@code false} otherwise
	 */
	private void generatePart(boolean isSingleWord) {
		clearAttributes();
		termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end
				- iterator.current);

		int startOffSet = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset
				+ iterator.current
				: savedStartOffset;
		int endOffSet = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset
				+ iterator.end;

		offsetAttribute.setOffset(startOffSet, endOffSet);
		posIncAttribute.setPositionIncrement(position(false));
		typeAttribute.setType(savedType);
	}

	/**
	 * Get the position increment gap for a subword or concatenation
	 * 
	 * @param inject
	 *            true if this token wants to be injected
	 * @return position increment gap
	 */
	private int position(boolean inject) {
		int posInc = accumPosInc;

		if (hasOutputToken) {
			accumPosInc = 0;
			return inject ? 0 : Math.max(1, posInc);
		}

		hasOutputToken = true;

		if (!hasOutputFollowingOriginal) {
			// the first token following the original is 0 regardless
			hasOutputFollowingOriginal = true;
			return 0;
		}
		// clear the accumulated position increment
		accumPosInc = 0;
		return Math.max(1, posInc);
	}

	/**
	 * Checks if the given word type includes {@link #ALPHA}
	 * 
	 * @param type
	 *            Word type to check
	 * @return {@code true} if the type contains ALPHA, {@code false} otherwise
	 */
	static boolean isAlpha(int type) {
		return (type & ALPHA) != 0;
	}

	/**
	 * Checks if the given word type includes {@link #DIGIT}
	 * 
	 * @param type
	 *            Word type to check
	 * @return {@code true} if the type contains DIGIT, {@code false} otherwise
	 */
	static boolean isDigit(int type) {
		return (type & DIGIT) != 0;
	}

	/**
	 * Checks if the given word type includes {@link #SUBWORD_DELIM}
	 * 
	 * @param type
	 *            Word type to check
	 * @return {@code true} if the type contains SUBWORD_DELIM, {@code false}
	 *         otherwise
	 */
	static boolean isSubwordDelim(int type) {
		return (type & SUBWORD_DELIM) != 0;
	}

	/**
	 * Checks if the given word type includes {@link #UPPER}
	 * 
	 * @param type
	 *            Word type to check
	 * @return {@code true} if the type contains UPPER, {@code false} otherwise
	 */
	static boolean isUpper(int type) {
		return (type & UPPER) != 0;
	}

	/**
	 * Determines whether the given flag is set
	 * 
	 * @param flag
	 *            Flag to see if set
	 * @return {@code} true if flag is set
	 */
	private boolean has(int flag) {
		return (flags & flag) != 0;
	}

	// ================================================= Inner Classes
	// =================================================

	/**
	 * A WDF concatenated 'run'
	 */
	final class WordDelimiterConcatenation {
		final StringBuilder buffer = new StringBuilder();
		int startOffset;
		int endOffset;
		int type;
		int subwordCount;

		/**
		 * Appends the given text of the given length, to the concetenation at
		 * the given offset
		 * 
		 * @param text
		 *            Text to append
		 * @param offset
		 *            Offset in the concetenation to add the text
		 * @param length
		 *            Length of the text to append
		 */
		void append(char text[], int offset, int length) {
			buffer.append(text, offset, length);
			subwordCount++;
		}

		/**
		 * Writes the concatenation to the attributes
		 */
		void write() {
			clearAttributes();
			if (termAttribute.length() < buffer.length()) {
				termAttribute.resizeBuffer(buffer.length());
			}
			char termbuffer[] = termAttribute.buffer();

			buffer.getChars(0, buffer.length(), termbuffer, 0);
			termAttribute.setLength(buffer.length());

			if (hasIllegalOffsets) {
				offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
			} else {
				offsetAttribute.setOffset(startOffset, endOffset);
			}
			posIncAttribute.setPositionIncrement(position(true));
			typeAttribute.setType(savedType);
			accumPosInc = 0;
		}

		/**
		 * Determines if the concatenation is empty
		 * 
		 * @return {@code true} if the concatenation is empty, {@code false}
		 *         otherwise
		 */
		boolean isEmpty() {
			return buffer.length() == 0;
		}

		/**
		 * Clears the concatenation and resets its state
		 */
		void clear() {
			buffer.setLength(0);
			startOffset = endOffset = type = subwordCount = 0;
		}

		/**
		 * Convenience method for the common scenario of having to write the
		 * concetenation and then clearing its state
		 */
		void writeAndClear() {
			write();
			clear();
		}
	}
	// questions:
	// negative numbers? -42 indexed as just 42?
	// dollar sign? $42
	// percent sign? 33%
	// downsides: if source text is "powershot" then a query of "PowerShot"
	// won't match!
}
